root_dir <- "/g/huber/users/fridljand/R/ihw-forest-paper/"
load(file.path(root_dir, "data/hqtl_chrom1_chrom2/snps1.filtered.allGT.txt.rda"))
snps1 <- snps
load(file.path(root_dir, "data/hqtl_chrom1_chrom2/snps2.filtered.allGT.txt.rda"))
snps2 <- snps

chr1_maf_download <- readRDS(file = file.path(root_dir, "data/downloaded_covariates/chr1_maf.Rds"))
chr2_maf_download <- readRDS(file = file.path(root_dir, "data/downloaded_covariates/chr2_maf.Rds"))

unique(unlist(apply(snps1[[1]], 1, unique)))
## [1] 0 1 2
#http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/faq.html
maf.list = vector('list', length(snps1))
for(sl in 1:length(snps1)) {
  slice = snps1[[sl]];
  maf.list[[sl]] = rowMeans(slice,na.rm=TRUE)/2;
  maf.list[[sl]] = pmin(maf.list[[sl]],1-maf.list[[sl]]);
}
chr1_maf_sample <- data.frame(
  refsnp_id = MatrixEQTL::rownames(snps1),
  minor_allele_freq =  unlist(maf.list)
)
hist(chr1_maf_sample$minor_allele_freq)
axis(1, at = seq(0.05, 1, by = 0.1), las=2)

saveRDS(chr1_maf_sample, file = file.path(root_dir, "data/hqtl_chrom1_chrom2/chr1_maf_sample.rds"))
#http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/faq.html
maf.list2 = vector('list', length(snps2))
for(sl in 1:length(snps2)) {
  slice = snps2[[sl]];
  maf.list2[[sl]] = rowMeans(slice,na.rm=TRUE)/2;
  maf.list2[[sl]] = pmin(maf.list2[[sl]],1-maf.list2[[sl]]);
}
chr2_maf_sample <- data.frame(
  refsnp_id = MatrixEQTL::rownames(snps2),
  minor_allele_freq =  unlist(maf.list2)
)
hist(chr2_maf_sample$minor_allele_freq)

saveRDS(chr2_maf_sample, file = file.path(root_dir, "data/hqtl_chrom1_chrom2/chr2_maf_sample.rds"))

Plot downloaded maf against sample

chr1_maf_sample_download <- dplyr::inner_join(chr1_maf_sample %>% rename(minor_allele_freq_sample = minor_allele_freq),
                                       chr1_maf_download %>% rename(minor_allele_freq_download = minor_allele_freq),
                                       by = "refsnp_id")
chr1_maf_sample_download <- chr1_maf_sample_download %>% drop_na(minor_allele_freq_download)

chr1_maf_sample_download <- chr1_maf_sample_download[sample(nrow(chr1_maf_sample_download), 3000), ]

#cor(chr1_maf_sample_download$minor_allele_freq_sample,chr1_maf_sample_download$minor_allele_freq_download)
plot(chr1_maf_sample_download$minor_allele_freq_sample, 
     chr1_maf_sample_download$minor_allele_freq_download,
     col = alpha("black", 0.3), 
     pch=16)
abline(coef = c(0,1))

Does not look very correlated.

Hardy-Weinberg equilibrium

sl = 1
sample_size = ncol(snps1)-1 #TODO -1?
maf_list_slice =  chr1_maf_sample$minor_allele_freq[1:2000]
snps1_slice = snps1[[sl]];
p = maf_list_slice
q = 1-p # Calulate frequency of minor allele being present in homozygous and heterozygous state
f_dom_hom = q^2
f_hom = p^2 
f_het = 2*p*q # Expected number of observations in a sample size of 300 

expected_f_hom_het <- data.frame(
  expected_0 = round(f_dom_hom * sample_size),
  expected_1=round(f_het * sample_size),
  expected_2 =round(f_hom * sample_size)
)
observed_f_hom_het <- apply(snps1_slice, 1, table)

observed_f_hom_het <- observed_f_hom_het %>%

  lapply(function (df){
    df <- as.data.frame(df)
    pivot_wider(df, names_from = "Var1", values_from = "Freq")
  })

observed_f_hom_het <- observed_f_hom_het %>%
  data.table::rbindlist(use.names=TRUE, fill=TRUE)

observed_f_hom_het[is.na(observed_f_hom_het)] <- 0
expected_observed_f_hom_het <- cbind(expected_f_hom_het, observed_f_hom_het)
ggplot(expected_observed_f_hom_het, aes(x = expected_0, y = `0`))+
  geom_point() +
  geom_abline(slope = 1)

ggplot(expected_observed_f_hom_het, aes(x = expected_1, y = `1`))+
  geom_point() +
  geom_abline(slope = 1)

ggplot(expected_observed_f_hom_het, aes(x = expected_2, y = `2`))+
  geom_point() +
  geom_abline(slope = 1)